# Computations
import numpy as np
import pandas as pd
import scipy.stats as stats
# sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import metrics
# Pytorch
import torch
from torch.autograd import Variable
import torch.nn as nn
import torchvision.transforms as transforms
# Visualisation libraries
## Text
from colorama import Fore, Back, Style
from IPython.display import Image, display, Markdown, Latex, clear_output
## progressbar
import progressbar
## plotly
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
import plotly.offline as py
from plotly.subplots import make_subplots
import plotly.express as px
## seaborn
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("paper", rc={"font.size":12,"axes.titlesize":14,"axes.labelsize":12})
## matplotlib
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
from matplotlib.patches import Ellipse, Polygon
import matplotlib.gridspec as gridspec
import matplotlib.colors
from pylab import rcParams
plt.style.use('seaborn-whitegrid')
import matplotlib as mpl
mpl.rcParams['figure.figsize'] = (17, 6)
mpl.rcParams['axes.labelsize'] = 14
mpl.rcParams['xtick.labelsize'] = 12
mpl.rcParams['ytick.labelsize'] = 12
mpl.rcParams['text.color'] = 'k'
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
In this article, we use Kaggle's Pima Indians Diabetes dataset. The Pima Indians are a group of Native Americans living in an area consisting of what is now central and southern Arizona. A variety of statistical methods are used here for predictions.
This dataset is originally from the National Institute of Diabetes and Digestive and Kidney Diseases. The objective of the dataset is to diagnostically predict whether or not a patient has diabetes, based on certain diagnostic measurements included in the dataset. Several constraints were placed on the selection of these instances from a larger database. In particular, all patients here are females at least 21 years old of Pima Indian heritage.
The dataset consists of several medical predictor variables and one target variable, Outcome. Predictor variables include the number of pregnancies the patient has had, their BMI, insulin level, age, and so on.
# Load the diabetes data set and preview its first rows.
Data = pd.read_csv('pima-indians-diabetes-database/diabetes_mod.csv')
display(Data.head())

# One-row summary of the data set size, rendered without the index column.
_summary = pd.DataFrame({'Number of Instances': [Data.shape[0]],
                         'Number of Attributes': [Data.shape[1]]}).style
# Styler.hide_index() was deprecated in pandas 1.4 and removed in 2.0.
# Prefer Styler.hide(axis='index') and fall back to the old call on older pandas.
if hasattr(_summary, 'hide'):
    _summary = _summary.hide(axis='index')
else:
    _summary = _summary.hide_index()
display(_summary)
| | Pregnancies | Glucose | Blood Pressure | Skin Thickness | Insulin | BMI | Diabetes Pedigree Function | Age | Outcome |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 6.0 | 148.0 | 72.0 | 35.0 | 0.0 | 33.6 | 0.627 | 50.0 | 1 |
| 1 | 1.0 | 85.0 | 66.0 | 29.0 | 0.0 | 26.6 | 0.351 | 31.0 | 0 |
| 2 | 8.0 | 183.0 | 64.0 | 0.0 | 0.0 | 23.3 | 0.672 | 32.0 | 1 |
| 3 | 1.0 | 89.0 | 66.0 | 23.0 | 94.0 | 28.1 | 0.167 | 21.0 | 0 |
| 4 | 0.0 | 137.0 | 40.0 | 35.0 | 168.0 | 43.1 | 0.254 | 33.0 | 1 |
| Number of Instances | Number of Attributes |
|---|---|
| 768 | 9 |
| Feature | Explanations |
|---|---|
| Pregnancies | Number of times pregnant |
| Glucose | Plasma glucose concentration at 2 hours in an oral glucose tolerance test |
| Blood Pressure | Diastolic blood pressure (mm Hg) |
| Skin Thickness | Triceps skinfold thickness (mm) |
| Insulin | 2-Hour serum insulin (mu U/ml) |
| BMI | Body mass index (weight in kg/(height in m)^2) |
| Diabetes Pedigree Function | Diabetes pedigree function |
| Age | Age (years) |
| Outcome | Whether or not a patient has diabetes |
We have already covered PyTorch ANNs in a separate article; interested readers are encouraged to refer to it.
Splitting the data into X and y sets:
# Name of the response column; every other column is treated as a predictor.
Target = 'Outcome'
# Response vector first, then the feature matrix without the response column.
y = Data[Target]
X = Data.drop(columns=[Target])
# Display names for the classes (used as tick labels in the confusion-matrix plots).
Labels = ['Non-Diabetic', 'Diabetic']
# Heatmap of the per-feature variance of X, ordered from the largest to the
# smallest variance.
fig, ax = plt.subplots(figsize=(10, 10))
# Single-row frame (row label 'Variance') with one column per feature.
Temp = X.var().sort_values(ascending=False).to_frame(name='Variance').round(2).T
# Temp.max(axis=1) is a Series indexed by the label 'Variance'. Read its only
# entry positionally with .iloc instead of `[0]`, which depends on the
# deprecated integer-key fallback of Series.__getitem__.
_ = sns.heatmap(Temp, ax=ax, annot=True, square=True, cmap=sns.color_palette("OrRd", 20),
                linewidths=0.8, vmin=0, vmax=Temp.max(axis=1).iloc[0], annot_kws={"size": 12.5},
                cbar_kws={'label': 'Feature Variance', "aspect": 80, "shrink": .4,
                          "orientation": "horizontal"})
# Put each word of a feature name on its own line, keeping 'of' attached to
# the word before it.
lb = [item.get_text().replace(' ', '\n').replace('\nof\n', ' of\n')
      for item in ax.get_xticklabels()]
_ = ax.set_xticklabels(lb)
# Remove the y tick label of the single row.
_ = ax.set_yticklabels('')
del Temp
We can standardize features by removing the mean and scaling to unit variance.
# Standardise every predictor column (zero mean, unit variance) and keep the
# result as a DataFrame with the original column names.
# NOTE(review): the scaler is fitted on all rows, i.e. before the train/test
# split performed later in this file, so held-out rows contribute to the
# scaling statistics. Confirm whether that is intended.
scaler = StandardScaler()
X = pd.DataFrame(data=scaler.fit_transform(X), columns=X.columns)
# Heatmap of the per-feature variance of X, ordered from the largest to the
# smallest variance.
fig, ax = plt.subplots(figsize=(10, 10))
# Single-row frame (row label 'Variance') with one column per feature.
Temp = X.var().sort_values(ascending=False).to_frame(name='Variance').round(2).T
# Temp.max(axis=1) is a Series indexed by the label 'Variance'. Read its only
# entry positionally with .iloc instead of `[0]`, which depends on the
# deprecated integer-key fallback of Series.__getitem__.
_ = sns.heatmap(Temp, ax=ax, annot=True, square=True, cmap=sns.color_palette("Greens", 20),
                linewidths=0.8, vmin=0, vmax=Temp.max(axis=1).iloc[0], annot_kws={"size": 12.5},
                cbar_kws={'label': 'Feature Variance', "aspect": 80, "shrink": .4,
                          "orientation": "horizontal"})
# Put each word of a feature name on its own line, keeping 'of' attached to
# the word before it.
lb = [item.get_text().replace(' ', '\n').replace('\nof\n', ' of\n')
      for item in ax.get_xticklabels()]
_ = ax.set_xticklabels(lb)
# Remove the y tick label of the single row.
_ = ax.set_yticklabels('')
del Temp
# Continue with plain NumPy arrays instead of pandas objects.
X, y = X.values, y.values
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Tabulate the shape of each resulting array (kept as the last expression so
# that the notebook renders the table).
_shapes = {'X_train': X_train.shape, 'X_test': X_test.shape,
           'y_train': y_train.shape, 'y_test': y_test.shape}
pd.DataFrame(data={'Set': list(_shapes.keys()),
                   'Shape': list(_shapes.values())}).set_index('Set').T
| Set | X_train | X_test | y_train | y_test |
|---|---|---|---|---|
| Shape | (537, 8) | (231, 8) | (537,) | (231,) |
# Train and test tensors
# Place the tensors on the GPU when one is available, otherwise on the CPU.
_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Feature tensors keep the dtype of the NumPy arrays; label tensors are cast
# to LongTensor before being moved to the target device.
X_train_tensor = Variable(torch.from_numpy(X_train).to(_device))
y_train_tensor = Variable(torch.from_numpy(y_train).type(torch.LongTensor).to(_device))
X_test_tensor = Variable(torch.from_numpy(X_test).to(_device))
y_test_tensor = Variable(torch.from_numpy(y_test).type(torch.LongTensor).to(_device))
# Mini-batch size and the target number of optimisation steps.
Batch_size = 100
iteration_number = 4e3
# Number of passes over the training set derived from the target step count.
# NOTE(review): len(X_train) / Batch_size is fractional when the batch size
# does not divide the training-set size, whereas the loader below yields a
# whole number of batches per epoch (the last one partial), so the realised
# number of steps can exceed iteration_number.
epochs_number = int(iteration_number / (len(X_train) / Batch_size))
# PyTorch train and test sets
Train_set = torch.utils.data.TensorDataset(X_train_tensor, y_train_tensor)
Test_set = torch.utils.data.TensorDataset(X_test_tensor, y_test_tensor)
# Data loaders. test_loader wraps Test_set so that any accuracy computed from
# it is measured on the held-out rows rather than on the training rows.
train_loader = torch.utils.data.DataLoader(Train_set, batch_size=Batch_size, shuffle=False)
test_loader = torch.utils.data.DataLoader(Test_set, batch_size=Batch_size, shuffle=False)
# Create ANN Model
class ANNModel(nn.Module):
    """Feed-forward network with three hidden layers and a linear readout.

    All hidden layers have ``hidden_Size`` units and are followed, in order,
    by ReLU, Tanh and ELU activations. The readout layer returns raw scores
    of width ``output_Size``; no softmax is applied inside the model.
    """

    def __init__(self, input_Size, hidden_Size, output_Size):
        super().__init__()
        # Hidden layer 1: linear map + ReLU
        self.fc1 = nn.Linear(input_Size, hidden_Size)
        self.relu1 = nn.ReLU()
        # Hidden layer 2: linear map + Tanh
        self.fc2 = nn.Linear(hidden_Size, hidden_Size)
        self.tanh2 = nn.Tanh()
        # Hidden layer 3: linear map + ELU
        self.fc3 = nn.Linear(hidden_Size, hidden_Size)
        self.elu3 = nn.ELU()
        # Readout layer: linear map to the output scores
        self.fc4 = nn.Linear(hidden_Size, output_Size)

    def forward(self, x):
        """Return the readout scores for the input batch ``x``."""
        hidden = self.relu1(self.fc1(x))
        hidden = self.tanh2(self.fc2(hidden))
        hidden = self.elu3(self.fc3(hidden))
        return self.fc4(hidden)
# Network dimensions: one input per feature column of X and one output per
# distinct value found in y.
input_Size = X.shape[1]
output_Size = np.unique(y).size
hidden_Size = 100
# Build the model and move its parameters to the GPU when one is available.
model = ANNModel(input_Size, hidden_Size, output_Size)
if torch.cuda.is_available():
    model.cuda()
# Loss: cross entropy between the model's output scores and the class labels.
CEL = nn.CrossEntropyLoss()
# Optimizer: plain stochastic gradient descent over all model parameters.
learning_rate = 1e-2
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
# Training the Model
Count = 0            # optimisation steps taken so far
Loss_list = []
Iteration_list = []
Accuracy_list = []
MSE_list = []        # not filled by this loop
MAE_list = []        # not filled by this loop
Steps = 10           # record loss/accuracy every `Steps` optimisation steps
n_features = X[0].shape[0]

# Size the progress bar from the number of steps the loops below will really
# take; a guessed upper bound makes update() fail once Count exceeds it.
total_steps = epochs_number * len(train_loader)
Progress_Bar = progressbar.ProgressBar(maxval=max(total_steps, 1),
                                       widgets=[progressbar.Bar('=', '|', '|'),
                                                progressbar.Percentage()])

for epoch in range(epochs_number):
    for Xtr, ytr in train_loader:
        # Flatten each batch to (batch, n_features)
        Xtr = Xtr.view(-1, n_features)
        # Set all gradients to zero
        optimizer.zero_grad()
        # Forward
        Out = model(Xtr.float())
        # Loss
        loss = CEL(Out, ytr.long())
        # Backward (calculating the gradients)
        loss.backward()
        # Update parameters
        optimizer.step()
        Count += 1

        if Count % Steps == 0:
            # Accuracy over every batch served by test_loader. Gradients are
            # not needed here, so skip building the autograd graph.
            Correct, Total = 0, 0
            with torch.no_grad():
                for Xts, yts in test_loader:
                    Xts = Xts.view(-1, n_features)
                    # Index of the largest output score = predicted class
                    Predicted = torch.max(model(Xts.float()), 1)[1]
                    Total += len(yts)
                    Correct += (Predicted == yts).sum().item()
            # Store plain Python numbers so that no tensors are kept alive
            # in the history lists.
            Loss_list.append(loss.item())
            Iteration_list.append(Count)
            Accuracy_list.append(Correct / float(Total))
        Progress_Bar.update(Count)
Progress_Bar.finish()

# One row per recorded step: iteration counter, training loss of that step,
# and the accuracy measured through test_loader.
history = pd.DataFrame({'Iteration': np.array(Iteration_list),
                        'Loss': np.array(Loss_list),
                        'Accuracy': np.array(Accuracy_list)})
del Loss_list, Iteration_list, Accuracy_list
|=========================================================================|100%
def Plot_history(history, Table_Rows = 25, yLim = 2):
    """Plot the loss and accuracy curves next to a table of sampled rows.

    Parameters
    ----------
    history : pandas.DataFrame
        Must provide the columns 'Iteration', 'Loss' and 'Accuracy'.
    Table_Rows : int
        Number of evenly spaced rows of ``history`` to list in the table;
        the final row is always listed as well.
    yLim : float
        Upper limit of the y axis of the line plot (the axis starts at 0).
    """
    fig = make_subplots(rows=1, cols=2, horizontal_spacing=0.02, column_widths=[0.6, 0.4],
                        specs=[[{"type": "scatter"}, {"type": "table"}]])
    # Left: loss and accuracy against the iteration counter
    fig.add_trace(go.Scatter(x=history['Iteration'].values,
                             y=history['Loss'].astype(float).values.round(4),
                             line=dict(color='OrangeRed', width=1.5), name='Loss'), 1, 1)
    fig.add_trace(go.Scatter(x=history['Iteration'].values,
                             y=history['Accuracy'].astype(float).values,
                             line=dict(color='MidnightBlue', width=1.5), name='Accuracy'), 1, 1)
    fig.update_layout(legend=dict(x=0, y=1.1, traceorder='reversed', font_size=12),
                      dragmode='select', plot_bgcolor='white', height=600, hovermode='closest',
                      legend_orientation='h')
    fig.update_xaxes(range=[history.Iteration.min(), history.Iteration.max()],
                     showgrid=True, gridwidth=1, gridcolor='Lightgray',
                     showline=True, linewidth=1, linecolor='Lightgray', mirror=True, row=1, col=1)
    fig.update_yaxes(range=[0, yLim], showgrid=True, gridwidth=1, gridcolor='Lightgray',
                     showline=True, linewidth=1, linecolor='Lightgray', mirror=True, row=1, col=1)
    # Right: table of evenly spaced rows plus the final row
    n_rows = history.shape[0]
    if n_rows:
        ind = np.linspace(0, n_rows, Table_Rows, endpoint=False).round(0).astype(int)
        # Rows are picked by position, so the entry added for the final row
        # must be the last position, not the last 'Iteration' value.
        ind = np.unique(np.append(ind, n_rows - 1))
        # Rounding can reach n_rows when Table_Rows is large relative to the
        # number of rows; drop such out-of-range positions.
        ind = ind[ind < n_rows]
    else:
        ind = np.array([], dtype=int)
    history = history.iloc[ind]
    fig.add_trace(go.Table(header=dict(values=list(history.columns), line_color='darkslategray',
                                       fill_color='DimGray', align=['center', 'center'],
                                       font=dict(color='white', size=12), height=25),
                           columnwidth=[0.4, 0.4, 0.4, 0.4],
                           cells=dict(values=[history.Iteration,
                                              history.Loss.astype(float).round(4).values,
                                              history.Accuracy.astype(float).round(4).values],
                                      line_color='darkslategray',
                                      fill=dict(color=['WhiteSmoke', 'white']),
                                      align=['center', 'center'], font_size=12, height=20)), 1, 2)
    fig.show()
def Confusion_Matrix(Model, FG = (12, 4), X_train_tensor = X_train_tensor, y_train = y_train,
                     X_test_tensor = X_test_tensor, y_test = y_test):
    """Draw raw and row-normalised confusion matrices for the train and test sets.

    One figure with two heatmaps is created per data set: the confusion
    matrix of counts on the left, and the same matrix divided by its row
    sums on the right.

    Parameters
    ----------
    Model : callable
        Maps a float feature tensor to per-class scores; the predicted class
        is the index of the largest score in each row.
    FG : tuple
        Figure size passed to ``plt.subplots``.
    X_train_tensor, X_test_tensor : torch.Tensor
        Feature tensors fed to ``Model``.
    y_train, y_test : array-like
        True labels matching the rows of the corresponding feature tensor.
    """
    font = FontProperties()
    font.set_weight('bold')

    splits = (('Train Set', X_train_tensor, y_train),
              ('Test Set', X_test_tensor, y_test))
    for title, features, y_true in splits:
        fig, ax = plt.subplots(1, 2, figsize=FG)
        _ = fig.suptitle(title, fontproperties=font, fontsize=16)
        # Predictions: use the model passed in by the caller (not a global)
        # and skip autograd bookkeeping, which is not needed for inference.
        with torch.no_grad():
            scores = Model(features.float())
        y_pred = torch.max(scores, 1)[1].cpu().numpy()
        # Confusion matrix of counts
        CM = metrics.confusion_matrix(y_true, y_pred)
        _ = sns.heatmap(CM.round(2), annot=True, annot_kws={"size": 14}, cmap="Blues", ax=ax[0])
        _ = ax[0].set_title('Confusion Matrix')
        # Divide every row by its sum
        CM = CM.astype('float') / CM.sum(axis=1)[:, np.newaxis]
        _ = sns.heatmap(CM.round(2), annot=True, annot_kws={"size": 14}, cmap="Greens", ax=ax[1],
                        linewidths=0.2, vmin=0, vmax=1, cbar_kws={"shrink": 1})
        _ = ax[1].set_title('Normalized Confusion Matrix')
        for a in ax:
            _ = a.set_xlabel('Predicted labels')
            _ = a.set_ylabel('True labels')
            _ = a.xaxis.set_ticklabels(Labels)
            _ = a.yaxis.set_ticklabels(Labels)
# Show the recorded training curves with the y axis limited to [0, 1],
# then the confusion matrices of the trained model.
Plot_history(history=history, yLim=1.0)
Confusion_Matrix(Model=model)